import numpy as np
import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()
import pandas as pd
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot
import plotly.graph_objects as go
import math
import seaborn as sns
from sklearn.metrics import mean_squared_error
np.random.seed(1)
tf.random.set_seed(1)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Dropout, RepeatVector, TimeDistributed
from keras import backend
# --- Experiment configuration ------------------------------------------------
# Model identity: recurrent layer class, the label printed in reports, and the
# base path (without extension) the trained model is saved under.
CMODEL = GRU
MODEL = "GRU"
MODELFILENAME = 'MODELS/GRU_6h_TFM'

# Sliding-window length in samples (the original note labels 36 steps as 6 h).
TIME_STEPS = 36

# Network hyper-parameters.
UNITS = 55
DROPOUT = 0.118
ACTIVATION = 'tanh'
OPTIMIZER = 'adamax'

# Training schedule.
EPOCHS = 36
BATCHSIZE = 9
VALIDATIONSPLIT = 0.2
# Code to read csv file into Colaboratory:
# from google.colab import files
# uploaded = files.upload()
# import io
# df = pd.read_csv(io.BytesIO(uploaded['SentDATA.csv']))
# Dataset is now stored in a Pandas Dataframe
# Load the raw measurements and index them by timestamp.
df = pd.read_csv('../../data/dadesTFM.csv')
# NOTE(review): reset_index() turns the default integer index into an extra
# 'index' column that is carried through every later frame. It is kept here so
# downstream column counts stay unchanged; drop it if it is not needed.
df.reset_index(inplace=True)
df['Time'] = pd.to_datetime(df['Time'])
df = df.set_index('Time')

# Canonical (space-free) names of the particulate-matter columns.
columns = ['PM1', 'PM25', 'PM10', 'PM1ATM', 'PM25ATM', 'PM10ATM']

# Rename the CSV headers to the canonical names and cast them to float32.
# rename() returns a new frame, so `df` itself is left untouched.
df1 = df.rename(columns={
    "PM 1": "PM1",
    "PM 2.5": "PM25",
    "PM 10": "PM10",
    "PM 1 ATM": "PM1ATM",
    "PM 2.5 ATM": "PM25ATM",
    "PM 10 ATM": "PM10ATM",
})
df1 = df1.astype({name: np.float32 for name in columns})
df2 = df1.copy()

# Chronological 80/20 split (no shuffling).
train_size = int(len(df2) * 0.8)
test_size = len(df2) - train_size
# Take explicit copies: assigning columns on a bare iloc slice is what raises
# pandas' SettingWithCopyWarning when the frames are standardized later on.
train = df2.iloc[:train_size].copy()
test = df2.iloc[train_size:].copy()
# A bare expression only displays in a notebook; print it so scripts show it too.
print((train.shape, test.shape))
((3117, 7), (780, 7))
# Standardize every training column independently, one scaler per column.
# `scaler` is rebound on each pass, so only the last fitted scaler survives.
for col in columns:
    scaler = StandardScaler().fit(train[[col]])
    train[col] = scaler.transform(train[[col]])
<ipython-input-6-83cecdbc25f8>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train[col] = scaler.fit_transform(train[[col]]) <ipython-input-6-83cecdbc25f8>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train[col] = scaler.fit_transform(train[[col]]) <ipython-input-6-83cecdbc25f8>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train[col] = scaler.fit_transform(train[[col]]) <ipython-input-6-83cecdbc25f8>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train[col] = scaler.fit_transform(train[[col]]) <ipython-input-6-83cecdbc25f8>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train[col] = scaler.fit_transform(train[[col]]) <ipython-input-6-83cecdbc25f8>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train[col] = scaler.fit_transform(train[[col]])
def create_sequences(X, y, time_steps=TIME_STEPS):
    """Slice ``X`` into overlapping windows with the value of ``y`` that follows each.

    Args:
        X: pandas object supporting positional ``.iloc`` slicing; each window
            is taken as ``X.iloc[i:i + time_steps].values``.
        y: pandas object supporting ``.iloc``; the target of window ``i`` is
            ``y.iloc[i + time_steps]``.
        time_steps: number of consecutive rows per window.

    Returns:
        Tuple ``(windows, targets)`` of NumPy arrays holding
        ``len(X) - time_steps`` entries each (empty when that is not positive).
    """
    n_windows = len(X) - time_steps
    windows = [
        X.iloc[start:start + time_steps].values for start in range(n_windows)
    ]
    targets = [y.iloc[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)
# Build the training windows from the second configured column (columns[1]);
# that single column is used both as the input feature and as the target.
train_series = train[columns[1]]
X_train, y_train = create_sequences(train_series.to_frame(), train_series)
#X_test, y_test = create_sequences(test[[columns[1]]], test[columns[1]])
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
X_train shape: (3081, 36, 1) y_train shape: (3081,)
# Add a new metric: root-mean-square error for Keras.
def rmse(y_true, y_pred):
    """Return the root of the mean squared difference over the last axis.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values.

    Returns:
        Tensor with ``sqrt(mean((y_pred - y_true) ** 2, axis=-1))``.
    """
    squared_error = backend.square(y_pred - y_true)
    mean_squared_error_last_axis = backend.mean(squared_error, axis=-1)
    return backend.sqrt(mean_squared_error_last_axis)
# Assemble the network: recurrent layer (CMODEL) -> dropout -> a single-unit
# Dense layer applied at every time step through TimeDistributed.
model = Sequential()
# return_sequences=True makes the recurrent layer emit one output per time step;
# the input shape is (window length, features) taken from X_train.
model.add(CMODEL(units = UNITS, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(rate=DROPOUT))
model.add(TimeDistributed(Dense(1,kernel_initializer='normal',activation=ACTIVATION)))
# MAE is the optimized loss; MSE and the custom rmse function are tracked as metrics.
# NOTE(review): the network outputs a whole sequence per window, whereas
# create_sequences yields a single target value per window — confirm that the
# resulting loss broadcasting is what is intended.
model.compile(optimizer=OPTIMIZER, loss='mae',metrics=['mse',rmse])
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= gru (GRU) (None, 36, 55) 9570 _________________________________________________________________ dropout (Dropout) (None, 36, 55) 0 _________________________________________________________________ time_distributed (TimeDistri (None, 36, 1) 56 ================================================================= Total params: 9,626 Trainable params: 9,626 Non-trainable params: 0 _________________________________________________________________
# Train in chronological order (shuffle=False), holding out the last
# VALIDATIONSPLIT fraction for validation. Training stops early once val_loss
# has failed to improve for 5 epochs.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    mode='min',
)
history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCHSIZE,
    validation_split=VALIDATIONSPLIT,
    callbacks=[early_stopping],
    shuffle=False,
)
Epoch 1/36 274/274 [==============================] - 3s 12ms/step - loss: 0.5666 - mse: 0.6476 - rmse: 0.6005 - val_loss: 0.3060 - val_mse: 0.2890 - val_rmse: 0.3558 Epoch 2/36 274/274 [==============================] - 3s 9ms/step - loss: 0.5139 - mse: 0.5454 - rmse: 0.5584 - val_loss: 0.2957 - val_mse: 0.2806 - val_rmse: 0.3395 Epoch 3/36 274/274 [==============================] - 3s 10ms/step - loss: 0.5070 - mse: 0.5361 - rmse: 0.5539 - val_loss: 0.2908 - val_mse: 0.2774 - val_rmse: 0.3314 Epoch 4/36 274/274 [==============================] - 2s 9ms/step - loss: 0.5036 - mse: 0.5313 - rmse: 0.5517 - val_loss: 0.2885 - val_mse: 0.2759 - val_rmse: 0.3267 Epoch 5/36 274/274 [==============================] - 3s 9ms/step - loss: 0.5012 - mse: 0.5286 - rmse: 0.5501 - val_loss: 0.2878 - val_mse: 0.2752 - val_rmse: 0.3242 Epoch 6/36 274/274 [==============================] - 3s 9ms/step - loss: 0.4993 - mse: 0.5263 - rmse: 0.5488 - val_loss: 0.2872 - val_mse: 0.2747 - val_rmse: 0.3223 Epoch 7/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4977 - mse: 0.5245 - rmse: 0.5476 - val_loss: 0.2861 - val_mse: 0.2741 - val_rmse: 0.3202 Epoch 8/36 274/274 [==============================] - 3s 9ms/step - loss: 0.4967 - mse: 0.5233 - rmse: 0.5468 - val_loss: 0.2866 - val_mse: 0.2739 - val_rmse: 0.3198 Epoch 9/36 274/274 [==============================] - 3s 9ms/step - loss: 0.4955 - mse: 0.5222 - rmse: 0.5459 - val_loss: 0.2864 - val_mse: 0.2736 - val_rmse: 0.3191 Epoch 10/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4945 - mse: 0.5213 - rmse: 0.5452 - val_loss: 0.2855 - val_mse: 0.2732 - val_rmse: 0.3180 Epoch 11/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4939 - mse: 0.5207 - rmse: 0.5449 - val_loss: 0.2859 - val_mse: 0.2732 - val_rmse: 0.3181 Epoch 12/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4933 - mse: 0.5201 - rmse: 0.5444 - val_loss: 0.2855 - val_mse: 0.2729 - val_rmse: 
0.3176 Epoch 13/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4929 - mse: 0.5195 - rmse: 0.5441 - val_loss: 0.2853 - val_mse: 0.2728 - val_rmse: 0.3173 Epoch 14/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4924 - mse: 0.5188 - rmse: 0.5438 - val_loss: 0.2853 - val_mse: 0.2727 - val_rmse: 0.3173 Epoch 15/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4920 - mse: 0.5186 - rmse: 0.5437 - val_loss: 0.2854 - val_mse: 0.2726 - val_rmse: 0.3173 Epoch 16/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4916 - mse: 0.5179 - rmse: 0.5434 - val_loss: 0.2850 - val_mse: 0.2723 - val_rmse: 0.3168 Epoch 17/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4914 - mse: 0.5178 - rmse: 0.5433 - val_loss: 0.2846 - val_mse: 0.2722 - val_rmse: 0.3165 Epoch 18/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4910 - mse: 0.5174 - rmse: 0.5432 - val_loss: 0.2844 - val_mse: 0.2720 - val_rmse: 0.3162 Epoch 19/36 274/274 [==============================] - 2s 9ms/step - loss: 0.4906 - mse: 0.5170 - rmse: 0.5429 - val_loss: 0.2844 - val_mse: 0.2719 - val_rmse: 0.3161 Epoch 20/36 274/274 [==============================] - 3s 9ms/step - loss: 0.4904 - mse: 0.5168 - rmse: 0.5428 - val_loss: 0.2837 - val_mse: 0.2716 - val_rmse: 0.3153 Epoch 21/36 274/274 [==============================] - 3s 9ms/step - loss: 0.4901 - mse: 0.5164 - rmse: 0.5426 - val_loss: 0.2835 - val_mse: 0.2715 - val_rmse: 0.3151 Epoch 22/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4899 - mse: 0.5160 - rmse: 0.5422 - val_loss: 0.2836 - val_mse: 0.2714 - val_rmse: 0.3150 Epoch 23/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4895 - mse: 0.5156 - rmse: 0.5419 - val_loss: 0.2831 - val_mse: 0.2712 - val_rmse: 0.3145 Epoch 24/36 274/274 [==============================] - 2s 9ms/step - loss: 0.4892 - mse: 0.5155 - rmse: 0.5418 - val_loss: 0.2832 - val_mse: 0.2712 
- val_rmse: 0.3147 Epoch 25/36 274/274 [==============================] - 3s 9ms/step - loss: 0.4889 - mse: 0.5151 - rmse: 0.5415 - val_loss: 0.2832 - val_mse: 0.2712 - val_rmse: 0.3146 Epoch 26/36 274/274 [==============================] - 3s 9ms/step - loss: 0.4885 - mse: 0.5146 - rmse: 0.5411 - val_loss: 0.2831 - val_mse: 0.2711 - val_rmse: 0.3145 Epoch 27/36 274/274 [==============================] - 3s 9ms/step - loss: 0.4883 - mse: 0.5146 - rmse: 0.5409 - val_loss: 0.2827 - val_mse: 0.2710 - val_rmse: 0.3142 Epoch 28/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4881 - mse: 0.5144 - rmse: 0.5408 - val_loss: 0.2826 - val_mse: 0.2710 - val_rmse: 0.3142 Epoch 29/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4878 - mse: 0.5140 - rmse: 0.5405 - val_loss: 0.2822 - val_mse: 0.2709 - val_rmse: 0.3139 Epoch 30/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4875 - mse: 0.5137 - rmse: 0.5402 - val_loss: 0.2825 - val_mse: 0.2710 - val_rmse: 0.3141 Epoch 31/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4873 - mse: 0.5137 - rmse: 0.5400 - val_loss: 0.2815 - val_mse: 0.2708 - val_rmse: 0.3133 Epoch 32/36 274/274 [==============================] - 3s 10ms/step - loss: 0.4871 - mse: 0.5135 - rmse: 0.5398 - val_loss: 0.2813 - val_mse: 0.2708 - val_rmse: 0.3132 Epoch 33/36 274/274 [==============================] - 3s 11ms/step - loss: 0.4866 - mse: 0.5131 - rmse: 0.5394 - val_loss: 0.2809 - val_mse: 0.2706 - val_rmse: 0.3128 Epoch 34/36 274/274 [==============================] - 3s 12ms/step - loss: 0.4866 - mse: 0.5130 - rmse: 0.5392 - val_loss: 0.2809 - val_mse: 0.2707 - val_rmse: 0.3128 Epoch 35/36 274/274 [==============================] - 3s 11ms/step - loss: 0.4863 - mse: 0.5129 - rmse: 0.5391 - val_loss: 0.2801 - val_mse: 0.2705 - val_rmse: 0.3122 Epoch 36/36 274/274 [==============================] - 3s 12ms/step - loss: 0.4861 - mse: 0.5125 - rmse: 0.5388 - val_loss: 0.2801 - 
val_mse: 0.2705 - val_rmse: 0.3121
import matplotlib.pyplot as plt

# Training vs. validation curves for the loss (MAE) and both extra metrics.
# Each plot gets its own figure: without plt.figure() a script run would draw
# the histogram below on top of these curves.
plt.figure()
for history_key, curve_label in (
    ('loss', 'MAE Training loss'),
    ('val_loss', 'MAE Validation loss'),
    ('mse', 'MSE Training loss'),
    ('val_mse', 'MSE Validation loss'),
    ('rmse', 'RMSE Training loss'),
    ('val_rmse', 'RMSE Validation loss'),
):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

# Model output on the training windows, and the mean absolute difference
# between that output and the input windows, averaged over the time axis.
X_train_pred = model.predict(X_train, verbose=0)
train_mae_loss = np.mean(np.abs(X_train_pred - X_train), axis=1)

# Distribution of the per-window training error.
plt.figure()
plt.hist(train_mae_loss, bins=50)
plt.xlabel('Train MAE loss')
plt.ylabel('Number of Samples')
plt.show()
def evaluate_prediction(predictions, actual, model_name):
    """Print and return the MAE, RMSE and MSE of ``predictions`` against ``actual``.

    Args:
        predictions: array of predicted values.
        actual: array of reference values, broadcast-compatible with
            ``predictions``.
        model_name: label printed as the report header.

    Returns:
        Tuple ``(mae, rmse, mse)`` computed over all elements.
    """
    residuals = predictions - actual
    mae = np.abs(residuals).mean()
    mse = np.square(residuals).mean()
    rmse = np.sqrt(mse)

    # Header, three metric lines and a trailing blank line.
    print(model_name + ':')
    report_lines = (
        'Mean Absolute Error: {:.4f}'.format(mae),
        'Root Mean Square Error: {:.4f}'.format(rmse),
        'Mean Square Error: {:.4f}'.format(mse),
        '',
    )
    for report_line in report_lines:
        print(report_line)
    return mae, rmse, mse
# Error of the model output on the training windows.
# The results use train_-prefixed names: unpacking into `rmse` would replace the
# module-level `rmse` Keras metric function with a float.
# NOTE(review): the output is compared with the input windows (X_train), not
# with y_train — confirm this is the intended reference.
train_mae, train_rmse, train_mse = evaluate_prediction(X_train_pred, X_train, MODEL)
GRU: Mean Absolute Error: 0.2083 Root Mean Square Error: 0.4410 Mean Square Error: 0.1945
# Persist the trained model as HDF5. The parent directory is created first so
# that a missing folder cannot make the save fail after training has finished.
import os

model_dir = os.path.dirname(MODELFILENAME)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
model.save(MODELFILENAME + '.h5')
# Compute the anomaly threshold from the test errors.
def calculate_threshold(X_test, X_test_pred, quantile=0.95):
    """Return the error value at the given quantile of the sorted distances.

    The distance of each row is the root of the mean squared difference
    between ``X_test_pred`` and ``X_test`` along axis 1. Distances are sorted
    in ascending order and the element at index ``int(quantile * n)`` is
    returned, clamped to the last element.

    Args:
        X_test: 2-D array-like of reference values.
        X_test_pred: array-like of predicted values with the same shape.
        quantile: fraction in ``[0, 1]`` used as the cut-off position
            (0.95 by default).

    Returns:
        The distance found at the cut-off position.

    Raises:
        ValueError: if ``quantile`` is outside ``[0, 1]`` or the inputs
            contain no rows.
    """
    if not 0.0 <= quantile <= 1.0:
        raise ValueError(f"quantile must be within [0, 1], got {quantile!r}")

    difference = np.asarray(X_test_pred) - np.asarray(X_test)
    distance = np.sqrt(np.mean(np.square(difference), axis=1))
    if len(distance) == 0:
        raise ValueError("cannot compute a threshold from empty input")

    # np.sort returns a sorted copy instead of reordering in place.
    ordered = np.sort(distance)
    # Clamp so that quantile == 1.0 selects the largest distance rather than
    # indexing one past the end.
    cut_off = min(int(quantile * len(ordered)), len(ordered) - 1)
    return ordered[cut_off]
# Work on an independent copy so the per-column assignments below do not raise
# pandas' SettingWithCopyWarning when `test` is still a slice of another frame.
test = test.copy()

for col in columns:
    print("####################### " + col + " ###########################")

    # Standardize the test column.
    # NOTE(review): the scaler is fitted on the test data itself instead of
    # reusing the statistics of the training data — confirm this is intended.
    scaler = StandardScaler()
    test[col] = scaler.fit_transform(test[[col]])

    # Build the sliding-window sequences for the test data.
    X_test1, y_test1 = create_sequences(test[[col]], test[col])
    print(f'Testing shape: {X_test1.shape}')

    # Evaluate the model (the name avoids shadowing the built-in eval()).
    eval_metrics = model.evaluate(X_test1, y_test1)
    print("evaluate: ", eval_metrics)

    # Run the model on the test windows and report the errors against them.
    X_test1_pred = model.predict(X_test1, verbose=0)
    evaluate_prediction(X_test1_pred, X_test1, MODEL)

    # Per-window MAE and RMSE, averaged over the time axis.
    test1_mae_loss = np.mean(np.abs(X_test1_pred - X_test1), axis=1)
    test1_rmse_loss = np.sqrt(np.mean(np.square(X_test1_pred - X_test1), axis=1))

    # Flatten predictions and data to (windows * time steps, features) before
    # computing the threshold.
    X_test1_predReshape = X_test1_pred.reshape(-1, X_test1_pred.shape[2])
    X_test1Reshape = X_test1.reshape(-1, X_test1.shape[2])
    threshold_test = calculate_threshold(X_test1Reshape, X_test1_predReshape)

    # Score table: the test rows that have a full window before them, plus the
    # per-window loss, the threshold and the resulting anomaly flag.
    test1_score_df = test.iloc[TIME_STEPS:].copy()
    test1_score_df['loss'] = test1_rmse_loss.reshape(-1)
    test1_score_df['threshold'] = threshold_test
    test1_score_df['anomaly'] = test1_score_df['loss'] > test1_score_df['threshold']

    # Plot of the test loss against the threshold.
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=test1_score_df.index, y=test1_score_df['loss'], name='Test loss'))
    fig.add_trace(go.Scatter(x=test1_score_df.index, y=test1_score_df['threshold'], name='Threshold'))
    fig.update_layout(showlegend=True, title='Test loss vs. Threshold')
    fig.show()

    # Collect the rows flagged as anomalies.
    anomalies1 = test1_score_df[test1_score_df['anomaly']]
    print('anomalies: ', anomalies1.shape)
    print()

    # Plot the points and the anomalies with the values transformed back to the
    # original scale, to verify that the normalization did not distort the
    # data. inverse_transform is given 2-D input and flattened for plotting.
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=test1_score_df.index,
        y=scaler.inverse_transform(test1_score_df[[col]]).ravel(),
        name=col,
    ))
    fig.add_trace(go.Scatter(
        x=anomalies1.index,
        y=scaler.inverse_transform(anomalies1[[col]]).ravel(),
        mode='markers',
        name='Anomaly',
    ))
    fig.update_layout(showlegend=True, title='Detected anomalies')
    fig.show()

    print("######################################################")
####################### PM1 ########################### Testing shape: (744, 36, 1) 18/24 [=====================>........] - ETA: 0s - loss: 0.4930 - mse: 0.8843 - rmse: 0.5764
<ipython-input-17-e1f1d6df3b5c>:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test[col] = scaler.fit_transform(test[[col]])
24/24 [==============================] - 0s 4ms/step - loss: 0.5594 - mse: 0.9859 - rmse: 0.6459 evaluate: [0.5594185590744019, 0.9858525395393372, 0.645856499671936] GRU: Mean Absolute Error: 0.1925 Root Mean Square Error: 0.5808 Mean Square Error: 0.3373
anomalies: (93, 10)
###################################################### ####################### PM25 ########################### Testing shape: (744, 36, 1) 24/24 [==============================] - 0s 3ms/step - loss: 0.5780 - mse: 0.9452 - rmse: 0.6659
<ipython-input-17-e1f1d6df3b5c>:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
evaluate: [0.577990710735321, 0.9452165961265564, 0.665899395942688] GRU: Mean Absolute Error: 0.2015 Root Mean Square Error: 0.5331 Mean Square Error: 0.2842
anomalies: (101, 10)
###################################################### ####################### PM10 ########################### Testing shape: (744, 36, 1) 24/24 [==============================] - 0s 3ms/step - loss: 0.5907 - mse: 0.9300 - rmse: 0.6799
<ipython-input-17-e1f1d6df3b5c>:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
evaluate: [0.5906554460525513, 0.929969310760498, 0.6798847317695618] GRU: Mean Absolute Error: 0.2087 Root Mean Square Error: 0.4913 Mean Square Error: 0.2414
anomalies: (60, 10)
###################################################### ####################### PM1ATM ########################### Testing shape: (744, 36, 1) 24/24 [==============================] - 0s 3ms/step - loss: 0.5962 - mse: 0.9541 - rmse: 0.6910
<ipython-input-17-e1f1d6df3b5c>:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
evaluate: [0.5962411761283875, 0.9540941715240479, 0.6909937858581543] GRU: Mean Absolute Error: 0.2043 Root Mean Square Error: 0.4997 Mean Square Error: 0.2497
anomalies: (79, 10)
###################################################### ####################### PM25ATM ########################### Testing shape: (744, 36, 1) 24/24 [==============================] - 0s 3ms/step - loss: 0.5918 - mse: 0.9567 - rmse: 0.6857
<ipython-input-17-e1f1d6df3b5c>:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
evaluate: [0.5917696356773376, 0.9566550254821777, 0.6856705546379089] GRU: Mean Absolute Error: 0.2030 Root Mean Square Error: 0.5116 Mean Square Error: 0.2617
anomalies: (86, 10)
###################################################### ####################### PM10ATM ########################### Testing shape: (744, 36, 1) 24/24 [==============================] - 0s 3ms/step - loss: 0.5870 - mse: 0.9273 - rmse: 0.6758 evaluate: [0.5869718194007874, 0.9273125529289246, 0.6758256554603577]
<ipython-input-17-e1f1d6df3b5c>:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
GRU: Mean Absolute Error: 0.2073 Root Mean Square Error: 0.5150 Mean Square Error: 0.2652
anomalies: (84, 10)
######################################################